library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggmosaic)

# set default theme for ggplot2
ggplot2::theme_set(ggplot2::theme_minimal(base_size = 16))

# set default figure parameters for knitr
knitr::opts_chunk$set(
  fig.width = 8,
  fig.asp = 0.618,
  fig.retina = 2,
  dpi = 150, 
  out.width = "70%"
)

The Dataset: The Great American Coffee Taste Test

# load data
raw_coffee_data <- read_csv("data/coffee_survey.csv")
## Rows: 4042 Columns: 57
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (44): submission_id, age, cups, where_drink, brew, brew_other, purchase,...
## dbl (13): expertise, coffee_a_bitterness, coffee_a_acidity, coffee_a_persona...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(raw_coffee_data)
## Rows: 4,042
## Columns: 57
## $ submission_id                <chr> "gMR29l", "BkPN0e", "W5G8jj", "4xWgGr", "…
## $ age                          <chr> "18-24 years old", "25-34 years old", "25…
## $ cups                         <chr> NA, NA, NA, NA, NA, NA, NA, NA, "Less tha…
## $ where_drink                  <chr> NA, NA, NA, NA, NA, NA, "At a cafe, At th…
## $ brew                         <chr> NA, "Pod/capsule machine (e.g. Keurig/Nes…
## $ brew_other                   <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ purchase                     <chr> NA, NA, NA, NA, NA, NA, "National chain (…
## $ purchase_other               <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ favorite                     <chr> "Regular drip coffee", "Iced coffee", "Re…
## $ favorite_specify             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ additions                    <chr> "No - just black", "Sugar or sweetener, N…
## $ additions_other              <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ dairy                        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ sweetener                    <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ style                        <chr> "Complex", "Light", "Complex", "Complex",…
## $ strength                     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ roast_level                  <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ caffeine                     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ expertise                    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ coffee_a_bitterness          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 4, NA, NA…
## $ coffee_a_acidity             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 4, NA, NA…
## $ coffee_a_personal_preference <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 4, NA, NA…
## $ coffee_a_notes               <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ coffee_b_bitterness          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 4, NA, NA…
## $ coffee_b_acidity             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 4, NA, NA…
## $ coffee_b_personal_preference <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 4, NA, NA…
## $ coffee_b_notes               <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ coffee_c_bitterness          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 4, NA, NA…
## $ coffee_c_acidity             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 4, NA, NA…
## $ coffee_c_personal_preference <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 4, NA, NA…
## $ coffee_c_notes               <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ coffee_d_bitterness          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 4, NA, NA…
## $ coffee_d_acidity             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 4, NA, NA…
## $ coffee_d_personal_preference <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 4, NA, NA…
## $ coffee_d_notes               <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ prefer_abc                   <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ prefer_ad                    <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ prefer_overall               <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ wfh                          <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ total_spend                  <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ why_drink                    <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ why_drink_other              <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ taste                        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ know_source                  <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ most_paid                    <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ most_willing                 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ value_cafe                   <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ spent_equipment              <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ value_equipment              <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ gender                       <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ gender_specify               <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ education_level              <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ ethnicity_race               <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ ethnicity_race_specify       <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ employment_status            <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ number_children              <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ political_affiliation        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
# clean and organize columns
coffee_data_clean <- raw_coffee_data %>%
  mutate(across(where(is.character), ~na_if(., "NA"))) %>%
  mutate(across(where(is.character), ~na_if(., "")))

coffee_data_clean <- coffee_data_clean %>%
  mutate(
    total_spend = as.numeric(total_spend)) %>%
  mutate(across(c
                (political_affiliation, 
                  education_level, 
                  ethnicity_race, 
                  gender, 
                  employment_status,
                  age),
                ~factor(.))) %>%
  mutate(
    brew = str_to_lower(brew),
    where_drink = str_to_lower(where_drink)) %>%
  mutate(cup_num = case_when(
    cups == "Less than 1" ~ 0,
    cups == "1" ~ 1,
    cups == "2" ~ 2,
    cups == "3" ~ 3,
    cups == "More than 4" ~ 5,
    cups == "4" ~ 4,
    TRUE ~ NA_real_
  ))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `total_spend = as.numeric(total_spend)`.
## Caused by warning:
## ! NAs introduced by coercion
distinct(raw_coffee_data, cups)
## # A tibble: 7 × 1
##   cups       
##   <chr>      
## 1 <NA>       
## 2 Less than 1
## 3 2          
## 4 1          
## 5 3          
## 6 More than 4
## 7 4
# remove predominately NA columns
prop_missing <- sapply(coffee_data_clean, function(x) mean(is.na(x)))
print(prop_missing)
##                submission_id                          age 
##                  0.000000000                  0.007669471 
##                         cups                  where_drink 
##                  0.023008412                  0.017318159 
##                         brew                   brew_other 
##                  0.095249876                  0.832261257 
##                     purchase               purchase_other 
##                  0.824344384                  0.992330529 
##                     favorite             favorite_specify 
##                  0.015338941                  0.971301336 
##                    additions              additions_other 
##                  0.020534389                  0.988124691 
##                        dairy                    sweetener 
##                  0.582879762                  0.873330035 
##                        style                     strength 
##                  0.020781791                  0.031172687 
##                  roast_level                     caffeine 
##                  0.025235032                  0.030925285 
##                    expertise          coffee_a_bitterness 
##                  0.025729837                  0.060366155 
##             coffee_a_acidity coffee_a_personal_preference 
##                  0.065066799                  0.062592776 
##               coffee_a_notes          coffee_b_bitterness 
##                  0.362196932                  0.064819396 
##             coffee_b_acidity coffee_b_personal_preference 
##                  0.068035626                  0.066551212 
##               coffee_b_notes          coffee_c_bitterness 
##                  0.392380010                  0.068777833 
##             coffee_c_acidity coffee_c_personal_preference 
##                  0.071994062                  0.068283028 
##               coffee_c_notes          coffee_d_bitterness 
##                  0.410440376                  0.068035626 
##             coffee_d_acidity coffee_d_personal_preference 
##                  0.068530430                  0.068777833 
##               coffee_d_notes                   prefer_abc 
##                  0.359722909                  0.066798615 
##                    prefer_ad               prefer_overall 
##                  0.069520040                  0.067293419 
##                          wfh                  total_spend 
##                  0.128154379                  1.000000000 
##                    why_drink              why_drink_other 
##                  0.117268679                  0.958683820 
##                        taste                  know_source 
##                  0.118505690                  0.119495299 
##                    most_paid                 most_willing 
##                  0.127412172                  0.131618011 
##                   value_cafe              spent_equipment 
##                  0.134092034                  0.132607620 
##              value_equipment                       gender 
##                  0.135576447                  0.128401781 
##               gender_specify              education_level 
##                  0.997031173                  0.149430975 
##               ethnicity_race       ethnicity_race_specify 
##                  0.154379020                  0.974022761 
##            employment_status              number_children 
##                  0.154131618                  0.157347848 
##        political_affiliation                      cup_num 
##                  0.186293914                  0.023008412
coffee_data_clean <- coffee_data_clean %>%
  select(where(~mean(is.na(.)) < 0.9))

# check dataset
str(coffee_data_clean)
## tibble [4,042 × 51] (S3: tbl_df/tbl/data.frame)
##  $ submission_id               : chr [1:4042] "gMR29l" "BkPN0e" "W5G8jj" "4xWgGr" ...
##  $ age                         : Factor w/ 7 levels "<18 years old",..: 3 4 4 5 4 7 3 NA 4 NA ...
##  $ cups                        : chr [1:4042] NA NA NA NA ...
##  $ where_drink                 : chr [1:4042] NA NA NA NA ...
##  $ brew                        : chr [1:4042] NA "pod/capsule machine (e.g. keurig/nespresso)" "bean-to-cup machine" "coffee brewing machine (e.g. mr. coffee)" ...
##  $ brew_other                  : chr [1:4042] NA NA NA NA ...
##  $ purchase                    : chr [1:4042] NA NA NA NA ...
##  $ favorite                    : chr [1:4042] "Regular drip coffee" "Iced coffee" "Regular drip coffee" "Iced coffee" ...
##  $ additions                   : chr [1:4042] "No - just black" "Sugar or sweetener, No - just black" "No - just black" "No - just black, Cinnamon" ...
##  $ dairy                       : chr [1:4042] NA NA NA NA ...
##  $ sweetener                   : chr [1:4042] NA NA NA NA ...
##  $ style                       : chr [1:4042] "Complex" "Light" "Complex" "Complex" ...
##  $ strength                    : chr [1:4042] NA NA NA NA ...
##  $ roast_level                 : chr [1:4042] NA NA NA NA ...
##  $ caffeine                    : chr [1:4042] NA NA NA NA ...
##  $ expertise                   : num [1:4042] NA NA NA NA NA NA NA NA NA NA ...
##  $ coffee_a_bitterness         : num [1:4042] NA NA NA NA NA NA NA NA 4 NA ...
##  $ coffee_a_acidity            : num [1:4042] NA NA NA NA NA NA NA NA 4 NA ...
##  $ coffee_a_personal_preference: num [1:4042] NA NA NA NA NA NA NA NA 4 NA ...
##  $ coffee_a_notes              : chr [1:4042] NA NA NA NA ...
##  $ coffee_b_bitterness         : num [1:4042] NA NA NA NA NA NA NA NA 4 NA ...
##  $ coffee_b_acidity            : num [1:4042] NA NA NA NA NA NA NA NA 4 NA ...
##  $ coffee_b_personal_preference: num [1:4042] NA NA NA NA NA NA NA NA 4 NA ...
##  $ coffee_b_notes              : chr [1:4042] NA NA NA NA ...
##  $ coffee_c_bitterness         : num [1:4042] NA NA NA NA NA NA NA NA 4 NA ...
##  $ coffee_c_acidity            : num [1:4042] NA NA NA NA NA NA NA NA 4 NA ...
##  $ coffee_c_personal_preference: num [1:4042] NA NA NA NA NA NA NA NA 4 NA ...
##  $ coffee_c_notes              : chr [1:4042] NA NA NA NA ...
##  $ coffee_d_bitterness         : num [1:4042] NA NA NA NA NA NA NA NA 4 NA ...
##  $ coffee_d_acidity            : num [1:4042] NA NA NA NA NA NA NA NA 4 NA ...
##  $ coffee_d_personal_preference: num [1:4042] NA NA NA NA NA NA NA NA 4 NA ...
##  $ coffee_d_notes              : chr [1:4042] NA NA NA NA ...
##  $ prefer_abc                  : chr [1:4042] NA NA NA NA ...
##  $ prefer_ad                   : chr [1:4042] NA NA NA NA ...
##  $ prefer_overall              : chr [1:4042] NA NA NA NA ...
##  $ wfh                         : chr [1:4042] NA NA NA NA ...
##  $ why_drink                   : chr [1:4042] NA NA NA NA ...
##  $ taste                       : chr [1:4042] NA NA NA NA ...
##  $ know_source                 : chr [1:4042] NA NA NA NA ...
##  $ most_paid                   : chr [1:4042] NA NA NA NA ...
##  $ most_willing                : chr [1:4042] NA NA NA NA ...
##  $ value_cafe                  : chr [1:4042] NA NA NA NA ...
##  $ spent_equipment             : chr [1:4042] NA NA NA NA ...
##  $ value_equipment             : chr [1:4042] NA NA NA NA ...
##  $ gender                      : Factor w/ 5 levels "Female","Male",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ education_level             : Factor w/ 6 levels "Bachelor's degree",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ ethnicity_race              : Factor w/ 6 levels "Asian/Pacific Islander",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ employment_status           : Factor w/ 6 levels "Employed full-time",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ number_children             : chr [1:4042] NA NA NA NA ...
##  $ political_affiliation       : Factor w/ 4 levels "Democrat","Independent",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ cup_num                     : num [1:4042] NA NA NA NA NA NA NA NA 0 NA ...
head(coffee_data_clean)
## # A tibble: 6 × 51
##   submission_id age         cups  where_drink brew  brew_other purchase favorite
##   <chr>         <fct>       <chr> <chr>       <chr> <chr>      <chr>    <chr>   
## 1 gMR29l        18-24 year… <NA>  <NA>        <NA>  <NA>       <NA>     Regular…
## 2 BkPN0e        25-34 year… <NA>  <NA>        pod/… <NA>       <NA>     Iced co…
## 3 W5G8jj        25-34 year… <NA>  <NA>        bean… <NA>       <NA>     Regular…
## 4 4xWgGr        35-44 year… <NA>  <NA>        coff… <NA>       <NA>     Iced co…
## 5 QD27Q8        25-34 year… <NA>  <NA>        pour… <NA>       <NA>     Latte   
## 6 V0LPeM        55-64 year… <NA>  <NA>        pod/… <NA>       <NA>     Iced co…
## # ℹ 43 more variables: additions <chr>, dairy <chr>, sweetener <chr>,
## #   style <chr>, strength <chr>, roast_level <chr>, caffeine <chr>,
## #   expertise <dbl>, coffee_a_bitterness <dbl>, coffee_a_acidity <dbl>,
## #   coffee_a_personal_preference <dbl>, coffee_a_notes <chr>,
## #   coffee_b_bitterness <dbl>, coffee_b_acidity <dbl>,
## #   coffee_b_personal_preference <dbl>, coffee_b_notes <chr>,
## #   coffee_c_bitterness <dbl>, coffee_c_acidity <dbl>, …

Description of Data

This data set comes from World Champion Barista James Hoffmann and coffee company Cometeer, who in 2023, collected survey responses corresponding to around 5000 taste testing kits distributed across the country about coffee preferences. I downloaded the data from Kaggle.

Research Question

Are there significant differences in economic coffee consumption patterns between individuals of different political affiliations within the same education or ethnic group?

Graph 1

cups_poli_edu <- ggplot(coffee_data_clean, aes(
  x = political_affiliation, 
  y = cup_num, 
  fill = political_affiliation)) +
  geom_boxplot() +
  facet_wrap(~ education_level) +
  labs(title = "Coffee Cups by Political Affiliation and Education Level",
       x = "Political Affiliation",
       y = "Cups of Coffee per Day",
       fill = "Political Affiliation") +
  theme_minimal()

ggplotly(cups_poli_edu)
## Warning: Removed 93 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Discussion of Graph 1 (200 words)

To create the above graph, I converted the cups variable from a character variable to a numeric by taking the median value of each subset of the data. I then chose to plot it in a box plot faceted across educational attainment and used color to show the distributions of coffee consumption daily by political affiliation. I think a lot can still be done to work on this graph, including improving color choice and reevaluating the labels on the x-axis. I think overall spacing is a struggle at the moment due to the wordy category types that I will need to address moving forward.

Draft Graph 2

convert_most_paid <- function(value) {
  case_when(
    value == "Less than $2" ~ 1,
    value == "More than $20" ~ 22,
    TRUE ~ {
      nums <- as.numeric(unlist(
        str_extract_all(value, "[0-9]+")))
      if (length(nums) == 2) {
        mean(nums) 
      } else {
        NA_real_
      }
    }
  )
}
coffee_data_clean <- coffee_data_clean %>%
  mutate(most_paid_numeric = sapply(
    most_paid, convert_most_paid))

ggplot(coffee_data_clean, aes(
  x = political_affiliation, 
  y = most_paid_numeric, 
  color = education_level)) +
  geom_jitter(width = 0.2, alpha = 0.5) +
  facet_wrap(~ ethnicity_race)
## Warning: Removed 515 rows containing missing values or values outside the scale range
## (`geom_point()`).

  labs(title = "Cafe Value for Money by Political Affiliation and Ethnicity",
       x = "Political Affiliation",
       y = "Most Paid for Cup of Coffee",
       fill = "Education Level")
## $x
## [1] "Political Affiliation"
## 
## $y
## [1] "Most Paid for Cup of Coffee"
## 
## $fill
## [1] "Education Level"
## 
## $title
## [1] "Cafe Value for Money by Political Affiliation and Ethnicity"
## 
## attr(,"class")
## [1] "labels"
  distinct(coffee_data_clean, most_paid)
## # A tibble: 9 × 1
##   most_paid    
##   <chr>        
## 1 <NA>         
## 2 $4-$6        
## 3 $2-$4        
## 4 $10-$15      
## 5 $6-$8        
## 6 $8-$10       
## 7 More than $20
## 8 $15-$20      
## 9 Less than $2

Draft Graph 3

distinct(coffee_data_clean, most_willing)
## # A tibble: 9 × 1
##   most_willing 
##   <chr>        
## 1 <NA>         
## 2 $8-$10       
## 3 More than $20
## 4 $15-$20      
## 5 $4-$6        
## 6 $6-$8        
## 7 $10-$15      
## 8 $2-$4        
## 9 Less than $2
ggplot(coffee_data_clean, aes(
  x = most_willing, 
  fill = education_level)) +
  geom_bar(position = "dodge") +
  facet_wrap(~ political_affiliation) +
  labs(
    title = "Most Willing to Pay for a Cup of Coffee by Political Affiliation",
    x = "Most Willing Price Range",
    y = "Count",
    fill = "Education Level"
  ) 

Discussion of Graphs 2 - 3 and Expansion

I think in regards to the draft graphs 2 and 3, I need to figure out better spacing or text size for the x-axis as the current format is very difficult to read. I like the idea of faceting these graphs across political affiliation or ethnic group, but I think when I consider their placement into a dashboard, I will need to think more critically about having multiple faceted visualizations and potentially look to visualize in such a way that does not need to be faceted or accounts for the interaction of the user as a means of separating out the interactions expressed by faceting above. Further graphs will definetly explore other economic related variables, as I am really interested in pulling out trends related to political affiliation, ethnicity, education and the idea of value or willingness to pay for coffee products.

Along with these graphs, I would like to create a Shiny Dashboard that would allow a viewer to select specific education levels or ethnic groups that would update visualizations to be able to further explore the visualizations above. I explored this type of dashboard in Tableau, and I really enjoyed the interactive element it incorporated into the dashboard so I would like to incorporate something similar into my final project through Shiny. Ideally, there will be 4-5 different visualizations and potentially a chart on the interactive dashboard.